# -*-coding:utf-8-*-

import gevent.monkey
gevent.monkey.patch_all()
from gevent.pool import Pool

import requests
from lxml import etree
from queue import Queue
import time

class DoubanBookSpider:
    """Crawl Douban book categories and the book listings under each one.

    Results accumulate in memory (``title_dict`` / ``books_data``) and are
    dumped to local text files by :meth:`run`.
    """

    def __init__(self):
        """Initialize base URL, request headers, result stores and proxies."""
        self.url_temp = "https://book.douban.com"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36"
        }
        self.title_dict = {}    # category name -> {sub-category name: url path}
        self.books_data = {}    # category name -> {sub-category name: book infos}
        # BUG FIX: the original literal repeated the "http" key four times, so
        # only the last entry survived — and it mapped an https-scheme proxy
        # under the "http" key.  `requests` expects one proxy per scheme,
        # keyed by scheme name.
        self.proxies = {
            "http": "http://171.41.80.197:9999",
            "https": "https://113.118.159.138:9000",
        }

    def get_classify_list(self):
        """Fetch the tag-overview (category) page.

        :return: decoded HTML of the category page as ``str``.
        :raises requests.RequestException: on network/proxy failure.
        """
        response = requests.get(
            url=self.url_temp + "/tag/?view=type&icn=index-sorttags-all",
            headers=self.headers,
            proxies=self.proxies,
            timeout=10,  # a dead free proxy must not hang the crawler forever
        )
        return response.content.decode()

    def get_content_list(self, html_str):
        """Parse the category page into ``self.title_dict``.

        :param html_str: HTML of the tag-overview page.
        :return: None (populates ``self.title_dict`` in place).
        """
        html = etree.HTML(html_str)
        div_list = html.xpath('//*[@id="content"]/div/div[1]/div[2]/div')
        for div in div_list:
            first = div.xpath('./a/@name')[0]
            second_dict = {}
            for td in div.xpath('.//td'):
                try:
                    second = td.xpath('./a/text()')[0]
                    second_href = td.xpath('./a/@href')[0]
                except IndexError:
                    # layout cells without an <a> tag — skip them
                    continue
                second_dict[second] = second_href
            self.title_dict[first] = second_dict

    def request_book(self, books_info_url):
        """Request one sub-category listing page.

        :param books_info_url: url path of the listing (joined to the base url).
        :return: parsed ``lxml`` element tree of the listing page.
        """
        books_list = requests.get(
            url=self.url_temp + books_info_url,
            headers=self.headers,
            proxies=self.proxies,
            timeout=10,
        )
        return etree.HTML(books_list.content.decode())

    def get_book_info(self, book_list_html):
        """Extract book records from one listing page (no pagination).

        Collected per book: name, metadata (author / translator / publisher /
        date / price), cover image url, score, one-line introduction, and the
        ISBN scraped from the book's detail page.

        :param book_list_html: parsed listing page (``lxml`` element tree).
        :return: dict mapping book title -> info dict.
        """
        book_infos = {}
        for book_data in book_list_html.xpath('//*[@id="subject_list"]/ul/li'):
            book_details_url = book_data.xpath('./div[@class="info"]/h2/a/@href')[0]
            book_name = book_data.xpath('./div[@class="info"]/h2/a/@title')[0]
            # renamed from the original's `book_info`, which was later shadowed
            # by the result dict of the same name
            book_meta = book_data.xpath('./div[@class="info"]/div/text()')[0].strip().split(" / ")
            book_img_url = book_data.xpath('./div[@class="pic"]/a[@class="nbg"]/img/@src')[0]
            book_score = book_data.xpath('./div[@class="info"]/div[@class="star clearfix"]/span[@class="rating_nums"]/text()')[0]
            book_introduce = book_data.xpath('./div[@class="info"]//p/text()')[0].replace('\n', "")

            # The ISBN lives on the detail page, as the second-to-last text
            # node of the #info block.
            book_details_html = etree.HTML(
                requests.get(url=book_details_url, headers=self.headers, timeout=10).content.decode()
            )
            isbn = book_details_html.xpath('//div[@id="info"]/text()')[-2]

            print(book_name, book_meta, book_img_url, book_score, book_introduce, isbn)
            print("=" * 30)
            book_infos[book_name] = {
                "book_name": book_name,
                "book_info": book_meta,
                "book_img_url": book_img_url,
                "book_score": book_score,
                "book_introduce": book_introduce,
                "ISBN": isbn,
            }
        return book_infos

    def run(self):
        """Run the crawl: categories first, then every sub-category listing.

        Saves the category map to ``./title_dict.txt`` and the collected book
        data to ``./bookinfos.txt``.
        """
        content_str = self.get_classify_list()
        self.get_content_list(content_str)
        print(self.title_dict)

        # persist the category map; explicit encoding so Chinese titles
        # survive on non-UTF-8 default locales
        with open("./title_dict.txt", "w", encoding="utf-8") as f:
            f.write(str(self.title_dict))

        for first, second_dict in self.title_dict.items():
            print(first)
            second_titles = {}
            for second_title, books_list_url in second_dict.items():
                try:
                    print("----------{}------------".format(second_title))
                    book_list_html = self.request_book(books_list_url)
                    second_titles[second_title] = self.get_book_info(book_list_html)
                except Exception as e:
                    # best-effort: one broken sub-category must not abort the
                    # whole crawl; log and move on
                    print(e)
            self.books_data[first] = second_titles
            print("*" * 60)

        # persist all collected book info locally
        with open("./bookinfos.txt", "w", encoding="utf-8") as f:
            f.write(str(self.books_data))



if __name__ == '__main__':
    # Entry point: build the spider and crawl everything in one shot.
    DoubanBookSpider().run()


